
"""
Do your changes here
build your preprocessing function here
"""
import re
from gensim import utils    
import nltk   
# Shared Porter stemmer instance; new_replace_special_chars and
# replace_special_chars call porter.stem() on every token.
porter = nltk.PorterStemmer()

def pp(line):
    """Preprocess *line* and return the cleaned text.

    The line is first passed through ``replace_html_entities`` and the
    result is then passed through ``replace_special_chars``.
    """
    return replace_special_chars(replace_html_entities(line))
 
def replace_html_entities(text):
    """Decode a few HTML entities in *text*, then strip markup tags.

    Entities are decoded one after another in a fixed order (``&lt;``,
    ``&gt;``, ``&quot;``, ``&amp;``, ``&nbsp;``), so a doubly-escaped
    ``&amp;nbsp;`` ends up as a plain space.  Decoding happens before tag
    removal, which means escaped tags such as ``&lt;p&gt;`` are removed too.
    """
    # Order matters: '&amp;' is decoded before '&nbsp;' on purpose.
    entity_table = (
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&quot;', '"'),
        ('&amp;', '&'),
        ('&nbsp;', ' '),
    )
    decoded = text
    for entity, plain in entity_table:
        decoded = decoded.replace(entity, plain)

    # Anything that looks like <...> (with no nested '<') is dropped.
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub('', decoded)


def new_replace_special_chars(text):
    """Regex-based cleaner: blank out punctuation and digits, then stem.

    Steps, in order:

    1. Every character in ``special_chars`` is replaced by a space.
    2. Every digit is replaced by a space.
    3. Runs of whitespace collapse to a single space and the text is
       lowercased.
    4. Each whitespace-separated token is stemmed with the module-level
       ``porter`` stemmer; the stems are joined with single spaces.

    Returns the resulting string (empty if nothing but punctuation, digits
    and whitespace was supplied).
    """
    # The characters are listed literally and run through re.escape so that
    # none of them can accidentally form a range (as '%-<' used to) or be
    # swallowed as a regex escape (as the backslash used to be).  '&', '*'
    # and '+' are listed explicitly because earlier versions of this
    # pattern stripped them as well.
    special_chars = ':?."@/\\(),\'=[];#%&*+-<>!{}`~$|'
    text = re.sub('[' + re.escape(special_chars) + ']', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    return ' '.join(porter.stem(token) for token in text.split())
 
def replace_special_chars(text):
    """Normalize *text* into a space-separated string of lowercase stems.

    Steps, in order:

    1. Every character in ``special_chars`` is replaced by a space.
    2. Digit-only runs that have whitespace on both sides are removed.
    3. Runs of whitespace collapse to a single space and the text is
       lowercased.
    4. Each whitespace-separated token is stemmed with the module-level
       ``porter`` stemmer; the stems are joined with single spaces.

    Underscores are intentionally left in place.  A number that touches the
    start or end of the string (no whitespace on one side) is kept.
    """
    special_chars = ':?."@/\\(),\'=[];#%-<>!{}`~$|*&^'
    for char in special_chars:
        text = text.replace(char, ' ')

    # Lookarounds test for the surrounding whitespace without consuming it.
    # Consuming it (r'\s\d+\s') made adjacent numbers share a separator, so
    # in "a 1 2 b" only the first number was removed.
    text = re.sub(r'(?<=\s)\d+(?=\s)', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    return ' '.join(porter.stem(token) for token in text.split())